import numpy as np
import pandas as pd
import os
import holoviews as hv
import geoviews as gv
import datashader as ds
from colorcet import fire, rainbow, bgy, bjy, bkr, kb, kr
from datashader.colors import colormap_select, Greys9
from holoviews.streams import RangeXY
from holoviews.operation.datashader import datashade, dynspread, rasterize
from bokeh.io import push_notebook, show, output_notebook
import warnings
warnings.filterwarnings('ignore')

# Load geolocation data; zip prefixes stay as strings so leading zeros survive.
geo = pd.read_csv("../dataset/olist_geolocation_dataset.csv", dtype={'geolocation_zip_code_prefix': str})

# Derive the leading 1-4 digits of each zip code prefix; shorter prefixes
# correspond to progressively larger geographic regions (explored below).
for n_digits in (1, 2, 3, 4):
    col = f'geolocation_zip_code_prefix_{n_digits}_digits'
    geo[col] = geo['geolocation_zip_code_prefix'].str[:n_digits]

# Remove outliers: keep only coordinates inside Brazil's bounding box.
#   north: 5 deg 16' 27.8" N   west: 73 deg 58' 58.19" W
#   south: 33 deg 45' 04.21" S east: 34 deg 47' 35.33" W
inside_brazil = (
    (geo.geolocation_lat <= 5.27438888)
    & (geo.geolocation_lng >= -73.98283055)
    & (geo.geolocation_lat >= -33.75116944)
    & (geo.geolocation_lng <= -34.79314722)
)
geo = geo[inside_brazil]
from datashader.utils import lnglat_to_meters as webm

# Project lon/lat into Web Mercator metres so points line up with map tiles.
x, y = webm(geo.geolocation_lng, geo.geolocation_lat)
geo['x'] = pd.Series(x)
geo['y'] = pd.Series(y)

# Cast the zip prefix columns to int so they can be aggregated when plotting.
prefix_columns = [
    'geolocation_zip_code_prefix',
    'geolocation_zip_code_prefix_1_digits',
    'geolocation_zip_code_prefix_2_digits',
    'geolocation_zip_code_prefix_3_digits',
    'geolocation_zip_code_prefix_4_digits',
]
for col in prefix_columns:
    geo[col] = geo[col].astype(int)
# Alias for the outlier-filtered, Brazil-only geolocation set.
brazil = geo
# Default column to aggregate over when rasterizing points.
agg_name = 'geolocation_zip_code_prefix'
# plot with holoviews + datashader - bokeh with map background
output_notebook()
hv.extension('bokeh')
# IPython cell magics (notebook export): default styling for overlays and the
# hover quadmesh used by plot_map below.
%opts Overlay [width=800 height=600 toolbar='above' xaxis=None yaxis=None]
%opts QuadMesh [tools=['hover'] colorbar=True] (alpha=0 hover_alpha=0.2)
# dynspread tuning: spread threshold and maximum pixel growth per point.
T = 0.05
PX = 1
def plot_map(data, label, agg_data, agg_name, cmap,
             tile_url="http://server.arcgisonline.com/ArcGIS/rest/services/Canvas/World_Dark_Gray_Base/MapServer/tile/{Z}/{Y}/{X}.png"):
    """Datashade *data* points over a web-tile basemap, with a hover layer.

    Parameters
    ----------
    data : DataFrame with Web Mercator 'x'/'y' columns plus *agg_name*.
    label : title applied to the returned overlay.
    agg_data : datashader reduction (e.g. ``ds.mean(agg_name)``) for shading.
    agg_name : value column carried into the points' vdims.
    cmap : colormap shared by the shaded image and the hover quadmesh.
    tile_url : WMTS tile URL template; defaults to the original dark-gray
        ArcGIS basemap, so existing callers are unaffected.

    Returns
    -------
    holoviews Overlay: basemap * datashaded/spread points * hover quadmesh.
    """
    geomap = gv.WMTS(tile_url)
    points = hv.Points(gv.Dataset(data, kdims=['x', 'y'], vdims=[agg_name]))
    agg = datashade(points, element_type=gv.Image, aggregator=agg_data, cmap=cmap)
    # T and PX are module-level dynspread settings defined in the setup cell.
    zip_codes = dynspread(agg, threshold=T, max_px=PX)
    # Coarse rasterization that follows pan/zoom (RangeXY) so hover shows
    # aggregate values without the cost of a full-resolution quadmesh.
    hover = hv.util.Dynamic(
        rasterize(points, aggregator=agg_data, width=50, height=25, streams=[RangeXY]),
        operation=hv.QuadMesh,
    )
    hover = hover.options(cmap=cmap)
    img = geomap * zip_codes * hover
    img = img.relabel(label)
    return img
orders_df = pd.read_csv('../dataset/olist_orders_dataset.csv')
order_items = pd.read_csv('../dataset/olist_order_items_dataset.csv')
order_reviews = pd.read_csv('../dataset/olist_order_reviews_dataset.csv')
customer = pd.read_csv('../dataset/olist_customers_dataset.csv', dtype={'customer_zip_code_prefix': str})

# First 3 digits of the customer zip code, as int, to match brazil_geo's index.
customer['customer_zip_code_prefix_3_digits'] = (
    customer['customer_zip_code_prefix'].str[0:3].astype(int)
)
brazil_geo = geo.set_index('geolocation_zip_code_prefix_3_digits').copy()

# One row per order item, enriched with customer and review information.
orders = (
    orders_df
    .merge(order_items, on='order_id')
    .merge(customer, on='customer_id')
    .merge(order_reviews, on='order_id')
)

for date_col in ('order_delivered_customer_date',
                 'order_estimated_delivery_date',
                 'order_delivered_carrier_date'):
    orders[date_col] = pd.to_datetime(orders[date_col])

# Whole days between carrier pickup and customer delivery.
orders['actual_delivery_time'] = (
    orders.order_delivered_customer_date - orders.order_delivered_carrier_date
).dt.days

# Average review score per 3-digit zip prefix, joined onto the map points.
gp = orders.groupby('customer_zip_code_prefix_3_digits')['review_score'].mean().to_frame()
score = brazil_geo.join(gp)
agg_name = 'avg_score'
score[agg_name] = score['review_score']
plot_map(score, 'Orders Average Review Score', ds.mean(agg_name), agg_name, cmap=bgy)
# Fraction of orders delivered after the estimated date, per 3-digit zip prefix.
orders['is_delayed'] = orders['order_delivered_customer_date'] > orders['order_estimated_delivery_date']
gp = orders.groupby('customer_zip_code_prefix_3_digits').agg({'is_delayed': ['sum', 'count']})
agg_name = 'delayed'
# .agg produced MultiIndex columns; address them by tuple.
gp[agg_name] = gp[('is_delayed', 'sum')] / gp[('is_delayed', 'count')]
gp = gp[agg_name]
order_delay = brazil_geo.join(gp)
plot_map(order_delay, 'Orders Delay Percentage in Brazil', ds.mean(agg_name), agg_name, cmap=bgy)
# Map only the zip prefixes whose mean review score is poor (below 3.5).
gp = orders.groupby('customer_zip_code_prefix_3_digits')['review_score'].mean().to_frame()
gp = gp[gp['review_score'] < 3.5]
score = brazil_geo.join(gp)
agg_name = 'avg_score'
score[agg_name] = score['review_score']
plot_map(score, 'Orders Bad Review Score', ds.mean(agg_name), agg_name, cmap=bgy)
# Map only zip prefixes where more than 20% of orders arrived late.
orders['is_delayed'] = orders['order_delivered_customer_date'] > orders['order_estimated_delivery_date']
gp = orders.groupby('customer_zip_code_prefix_3_digits').agg({'is_delayed': ['sum', 'count']})
agg_name = 'delayed'
# .agg produced MultiIndex columns; address them by tuple.
gp[agg_name] = gp[('is_delayed', 'sum')] / gp[('is_delayed', 'count')]
gp = gp.loc[(gp['delayed'] > 0.2)]
gp = gp[agg_name]
order_delay = brazil_geo.join(gp)
plot_map(order_delay, 'Orders With Big Delay Percentage in Brazil', ds.mean(agg_name), agg_name, cmap=bgy)
# --- Delay vs. review analysis (self-contained section: re-imports and reloads) ---
# Fix: matplotlib.pyplot was imported twice in the original; stray semicolon removed.
import datetime as dt
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings('ignore')
orders = pd.read_csv('../dataset/olist_orders_dataset.csv', delimiter=',')
customers = pd.read_csv('../dataset/olist_customers_dataset.csv', delimiter=',')
reviews = pd.read_csv('../dataset/olist_order_reviews_dataset.csv', delimiter=',')

# Orders enriched with customer and review information.
final = pd.merge(orders, customers, on='customer_id')
final = pd.merge(final, reviews, on='order_id')

# BUG FIX: parse the timestamps BEFORE comparing them. The raw columns are
# strings (with NaN for undelivered orders), so the original compared strings
# lexicographically and had undefined behavior on missing values. With real
# datetimes, NaT comparisons are well-defined (always False).
final['order_delivered_customer_date'] = pd.to_datetime(final['order_delivered_customer_date'])
final['order_estimated_delivery_date'] = pd.to_datetime(final['order_estimated_delivery_date'])
final['is_delayed'] = final['order_delivered_customer_date'] > final['order_estimated_delivery_date']
# Signed delay: positive timedelta means delivered after the estimate.
final['time_delay'] = final['order_delivered_customer_date'] - final['order_estimated_delivery_date']
# Per-order delay stats. NOTE(review): after the merges each order_id group is
# a handful of duplicated rows; this frame is computed and sorted below but
# never used afterwards — looks like dead code, kept for parity.
customer_delay = final.groupby('order_id', as_index=False).agg({'is_delayed': ['sum', 'count']})
customer_delay['delay'] = customer_delay['is_delayed']['sum'] / customer_delay['is_delayed']['count']
# Per-state delayed fraction: delayed rows / total rows.
state_delay = final.groupby('customer_state', as_index=False).agg({'is_delayed': ['sum', 'count']})
state_delay['delay'] = state_delay['is_delayed']['sum'] / state_delay['is_delayed']['count']
customer_delay = customer_delay.sort_values(by='delay', ascending=False)
state_delay = state_delay.sort_values(by='delay', ascending=False)
# Mean review score per state, merged with the delay stats. The .agg above
# produced MultiIndex columns, so after the merge those columns must be
# addressed by tuple, e.g. ('delay', '').
state_reviews = final.groupby('customer_state', as_index=False)['review_score'].mean()
state_reviews = pd.merge(state_reviews, state_delay, on='customer_state')
# Flatten the MultiIndex columns into plain, single-level names.
state_reviews['delay'] = state_reviews[('delay', '')]
state_reviews['delayed'] = state_reviews[('is_delayed', 'sum')]
state_reviews['total_orders'] = state_reviews[('is_delayed', 'count')]
state_reviews = state_reviews[['customer_state','review_score','delay','delayed','total_orders']]
state_reviews = state_reviews.sort_values('delay', ascending=False)
# 27 = all Brazilian federative units (26 states + federal district).
state_reviews.head(27)
#dado.reset_index(inplace=True)
sns.set(rc={'figure.figsize':(15,13)})
# Bar chart: delayed-order fraction per state, worst states first.
ax = sns.barplot(x="delay", y="customer_state", data=(state_reviews.head(27)))
# Bar chart: mean review score per state, lowest first. The x-axis is zoomed
# to the 3.5-4.2 band where all states fall, to make differences visible.
state_reviews = state_reviews.sort_values('review_score', ascending=True)
ax = sns.barplot(x="review_score", y="customer_state", data=state_reviews.head(27))
ax.set(xlim=(3.5, 4.2))
def _delayed_fraction(df):
    """Fraction of rows in *df* with is_delayed == True (0 if empty)."""
    counts = df.groupby(["is_delayed"]).size()
    return counts.get(True, 0) / counts.sum()

# BUG FIX: the original computed delay.iloc[1][1] / delay.iloc[0][1], i.e.
# delayed count divided by the NON-delayed count — an odds ratio, not the
# percentage the print statements claim. Divide by the total instead.
# (This also drops the deprecated chained positional .iloc[i][j] indexing.)
delay = _delayed_fraction(final)
print(("%.2f" % (delay*100)),'% of all orders are delayed\n')

bad_delay = _delayed_fraction(final.loc[final['review_score'] <= 3])
print(("%.2f" % (bad_delay*100)),'% of the orders with bad reviews are delayed\n')

good_delay = _delayed_fraction(final.loc[final['review_score'] >= 4])
print(("%.2f" % (good_delay*100)),'% of the orders with good reviews are delayed \n')

print(("%.2f" % ((1-bad_delay)*100)),'% of the orders with bad review are not related to delay\n')
print(("%.2f" % ((1-good_delay)*100)),'% of the orders with good review are not delayed \n')
# --- Dissatisfaction by product category (fresh, trimmed reload) ---
orders = pd.read_csv('../dataset/olist_orders_dataset.csv', delimiter=',')
order_item = pd.read_csv('../dataset/olist_order_items_dataset.csv', delimiter=',')
products = pd.read_csv('../dataset/olist_products_dataset.csv', delimiter=',')
reviews = pd.read_csv('../dataset/olist_order_reviews_dataset.csv', delimiter=',')
customer = pd.read_csv('../dataset/olist_customers_dataset.csv', delimiter=',')

# Keep only the columns this analysis needs.
orders = orders[['order_id', 'customer_id', 'order_purchase_timestamp',
                 'order_delivered_customer_date', 'order_estimated_delivery_date']]
order_item = order_item[['order_id', 'product_id']]
products = products[['product_id', 'product_category_name']]
reviews = reviews[['order_id', 'review_score', 'review_creation_date']]
customer = customer[['customer_id', 'customer_city', 'customer_state']]

# One row per order item with its review, category and customer location.
final = (
    orders
    .merge(order_item, on='order_id')
    .merge(reviews, on='order_id')
    .merge(products, on='product_id')
    .merge(customer, on='customer_id')
)
# BUG FIX: compare delivery dates as datetimes, not raw strings. read_csv
# leaves these columns as objects (strings, NaN for undelivered orders);
# the original string comparison relied on lexicographic ordering and
# mishandled missing values. NaT comparisons are well-defined (always False).
final['order_delivered_customer_date'] = pd.to_datetime(final['order_delivered_customer_date'])
final['order_estimated_delivery_date'] = pd.to_datetime(final['order_estimated_delivery_date'])
final['is_delayed'] = final['order_delivered_customer_date'] > final['order_estimated_delivery_date']

reviews = final[['review_score','product_category_name', 'is_delayed']]
# Bad reviews (score <= 3) on orders that were NOT delayed, per category,
# as a fraction of all reviews in that category — dissatisfaction that
# cannot be explained by late delivery.
insatisfaction = reviews.loc[(reviews['review_score'] <= 3) & (reviews['is_delayed'] == False)]
insatisfaction = insatisfaction.groupby(["product_category_name"]).size().reset_index(name="Bad_reviews")
review = reviews.groupby(["product_category_name"]).size().reset_index(name="All_reviews")
insatisfaction = pd.merge(insatisfaction, review, on='product_category_name')
insatisfaction['Average_Insatisfaction'] = insatisfaction['Bad_reviews'] / insatisfaction['All_reviews']
insatisfaction = insatisfaction.sort_values('Average_Insatisfaction', ascending=False)
insatisfaction.head(10)
# Same breakdown as above, but for bad reviews on orders that WERE delayed.
delayed_bad = (reviews['review_score'] <= 3) & (reviews['is_delayed'] == True)
insatisfaction = (
    reviews.loc[delayed_bad]
    .groupby(["product_category_name"])
    .size()
    .reset_index(name="Bad_reviews")
)
review = reviews.groupby(["product_category_name"]).size().reset_index(name="All_reviews")
insatisfaction = insatisfaction.merge(review, on='product_category_name')
insatisfaction['Average_Insatisfaction'] = insatisfaction['Bad_reviews'] / insatisfaction['All_reviews']
insatisfaction = insatisfaction.sort_values('Average_Insatisfaction', ascending=False)
insatisfaction.head(10)